{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "d88d2d5c-905a-4167-ba5f-08791bdeea32",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cf2ecc78-40e9-4f02-84fa-fae79b93643f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>50万以上</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10万以下</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10万以下</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>50万以上</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10万以下</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10万以下</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>30-50万</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10-30万</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     data\n",
       "0   50万以上\n",
       "1   10万以下\n",
       "2   10万以下\n",
       "3   50万以上\n",
       "4   10万以下\n",
       "5   10万以下\n",
       "6  30-50万\n",
       "7  10-30万"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = ['50万以上', '10万以下', '10万以下', '50万以上',\n",
    "        '10万以下', '10万以下', '30-50万', '10-30万']\n",
    "df = pd.DataFrame({\"data\": data})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "cc5a3bd4-4217-4e52-be1f-423bbb2db449",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    3\n",
       "1    1\n",
       "2    1\n",
       "3    3\n",
       "4    1\n",
       "5    1\n",
       "6    2\n",
       "7    0\n",
       "dtype: int8"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.data.astype(\"category\").cat.codes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7275cc8d-c504-4728-b18e-fdad21a79309",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>50万以上</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10万以下</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10万以下</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>50万以上</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10万以下</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10万以下</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>30-50万</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10-30万</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     data\n",
       "0   50万以上\n",
       "1   10万以下\n",
       "2   10万以下\n",
       "3   50万以上\n",
       "4   10万以下\n",
       "5   10万以下\n",
       "6  30-50万\n",
       "7  10-30万"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "57d0239c-94ff-40d1-b46b-5e92fdd80b0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.data = pd.factorize(df.data)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "eb4d8d9b-5d16-4518-b138-7bc6c6b84664",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   data\n",
       "0     0\n",
       "1     1\n",
       "2     1\n",
       "3     0\n",
       "4     1\n",
       "5     1\n",
       "6     2\n",
       "7     3"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "c421678c-bd16-4acc-8d24-e9ba9b4c1719",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>tree</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>张三</td>\n",
       "      <td>1.0</td>\n",
       "      <td>开</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>李四</td>\n",
       "      <td>NaN</td>\n",
       "      <td>关</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>张三</td>\n",
       "      <td>3.0</td>\n",
       "      <td>未知</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  one  two tree\n",
       "0  张三  1.0    开\n",
       "1  李四  NaN    关\n",
       "2  张三  3.0   未知"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d = {\"one\":[\"张三\",\"李四\",\"张三\"],\"two\":[1,np.nan,3],\"tree\":[\"开\",\"关\",\"未知\"]}\n",
    "dd = pd.DataFrame(d)\n",
    "dd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "15fbd00a-a7f9-40e4-bcbe-ea3134d64176",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "acf40dab-d98a-47f3-b990-43c7eec8b95e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([['张三'],\n",
       "       ['李四'],\n",
       "       ['张三']], dtype=object)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = dd[\"one\"].to_numpy().reshape(-1,1)\n",
    "X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "af9d070d-da5c-4010-8831-216fa306c667",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0.],\n",
       "       [0., 1.],\n",
       "       [1., 0.]])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enc = OneHotEncoder(categories=\"auto\")\n",
    "enc.fit_transform(X).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "45c46613-6956-4e39-ae5d-b0739e0e31cb",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\install\\python37\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array(['x0_张三', 'x0_李四'], dtype=object)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enc.get_feature_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "ab2dcb29-ff16-4d6d-9879-71fc1dff1ddd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>tree</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>张三</td>\n",
       "      <td>开</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>李四</td>\n",
       "      <td>关</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>张三</td>\n",
       "      <td>未知</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  one tree\n",
       "0  张三    开\n",
       "1  李四    关\n",
       "2  张三   未知"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "XX= dd[[\"one\",\"tree\"]]\n",
    "XX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "06e1d4fe-d745-45ba-9e58-d9f4c9c8f7f8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['one', 'tree'], dtype='object')"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "XX.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5b675e54-2b92-4db3-b160-8d89c66c9536",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0., 1., 0.],\n",
       "       [0., 1., 1., 0., 0.],\n",
       "       [1., 0., 0., 0., 1.]])"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enc2 = OneHotEncoder()\n",
    "res = enc2.fit_transform(XX).toarray()\n",
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "ebdfbea5-2831-4f14-9468-789b1c9dfa3d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['one_张三', 'one_李四', 'tree_关', 'tree_开', 'tree_未知'], dtype=object)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enc2.get_feature_names_out()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "213569df-4e95-44bd-a466-385d8d3d4625",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([['张三', '开'],\n",
       "       ['李四', '关'],\n",
       "       ['张三', '未知']], dtype=object)"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enc2.inverse_transform(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "52e5e6d8-570c-45fb-95ff-366d79a31c2d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>two</th>\n",
       "      <th>one_张三</th>\n",
       "      <th>one_李四</th>\n",
       "      <th>tree_关</th>\n",
       "      <th>tree_开</th>\n",
       "      <th>tree_未知</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   two  one_张三  one_李四  tree_关  tree_开  tree_未知\n",
       "0  1.0     1.0     0.0     0.0     1.0      0.0\n",
       "1  NaN     0.0     1.0     1.0     0.0      0.0\n",
       "2  3.0     1.0     0.0     0.0     0.0      1.0"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([dd,pd.DataFrame(res,columns=enc2.get_feature_names_out())],axis=1).drop(XX.columns,axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e9f13501-54e7-4fba-b857-34a3bf496fed",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
